This example uses simulated data to demonstrate how to include Date as a continuous regressor and Site as a discrete regressor for Etiology and False Positive Rate (FPR) in conditional independence model.
rm(list=ls())
library(baker)
library(lubridate)
library(rjags)
library(mgcv)
The function simulate_nplcm() can simulate data for conditional independence model by setting K = 2 and control/case subclass weights to c(1,0). Other parameter settings in this example include:
N.SITE = 2;J = 3 and the pathogens are named as A, B, C;MBS1 and a single slice of SS data MSS1;MBS1, while only pathogens A and B are tested in MSS1;c(0.95,0.9,0.85) for MBS1measurement and c(0.25,0.10,0.15) for MSS1 measurement.Nd = Nu = 1; Nd and Nu are forced to be non-zero in the function simulate_nplcm(). Actually we only need the measurement data for one subject (either case or control) each time we call the function. This can be achieved by removing redundant subject afterwards.In this example, we index the two sites as 1, 2 and assume dates varying from 1 to 300. Suppose we have 300 cases and 200 controls for each site. For each subject, date is randomly sampled from 1 to 300. The corresponding etiology probability (\(\pi\)) and FPR for MBS1 (\(\psi\)) are calculated according to the following additive regression models. The regression parameters for Etiology regression \(\beta_{ij}\) for \(i=0,1,2,3\) and \(j=1,...,J\) is specified in Eti.reg.paraMat.list. The regression parameters for FPR regression \(\alpha_{ij}\) for \(i=0,1,2,3\) and \(j=1,...,J\) is specified in FPR.reg.paraMat.list.
\[log\phi_j=\beta_{0j}+\beta_{1j}\mathbb{1}\{Site=1\}+\beta_{2j}(Date-150)+\beta_{3j}(Date-150)^2, \space \pi_j=\frac{\phi_j}{\sum_{k=1}^J\phi_k}\] \[\psi_j=\alpha_{0j}+\alpha_{1j}\mathbb{1}\{Site=1\}+\alpha_{2j}(Date-150)+\alpha_{3j}(Date-150)^2\]
The regressor SITE and DATE should be specified in data_nplcm$X as a data frame. Since simulate_nplcm() set the returned data_nplcm$X to NULL, we should manually define data_nplcm$X as required. Further, data_nplcm$X should also contain a column Y recording the case/control status, which should be the same asdata_nplcm$Y. After combining the data of all sites, we need to adjust the order of subjects proporly to make sure that the case group is on top of the control group, as required by the model fitting function nplcm().
J = 3 # number of causes
cause_list = c(LETTERS[1:J]) # cause list
N.SITE = 2 # number of sites
K = 2 # number of subclasses
lambda = c(1,0) # subclass weights for control group
eta = c(1,0) # subclass weights for case group
Nd = 300 # number of subjects for case group
Nu = 200 # number of subjects for control group
# etiology regression simulation
Eti.reg.paraMat.list = list(c(3,-1,0,(-2+1)/150^2),c(4,1,0,(-5+2)/150^2),c(1,0,0,0))
compute_Eti_given_date <- function(siteID,date){
Phi = sapply(Eti.reg.paraMat.list, function(para)
return(exp(para[1]+para[2]*(siteID==1)+para[3]*(date-150)+para[4]*(date-150)^2)))
Eti = Phi/sum(Phi)
return(Eti)
}
# FPR regression simulation
FPR.reg.paraMat.list = list(c(0.35,-0.05,0,-0.1/150^2),c(0.35,-0.05,0,-0.1/150^2),c(0.2,-0.05,0,-0.05/150^2))
compute_FPR_given_date <- function(siteID,date) {
return(sapply(FPR.reg.paraMat.list, function(para) return(para[1]+para[2]*(siteID==1)+para[3]*(date-150)+para[4]*(date-150)^2)))
}
set.seed(20181018)
data_nplcm_list = lapply(1:N.SITE,function(siteID){
fulldata_list_perSite = lapply(1:(Nd+Nu), function(caseID){
date = sample(1:300,1)
set_parameter <- list(
cause_list = cause_list,
etiology = compute_Eti_given_date(siteID,date),
pathogen_BrS = LETTERS[1:J],
SS = T,
pathogen_SS = LETTERS[1:2],
meas_nm = list(MBS = c("MBS1"),MSS=c("MSS1")),
Lambda = lambda, # for BrS
Eta = t(replicate(J,eta)), # case subclass weight for BrS
PsiBS = cbind(compute_FPR_given_date(siteID,date),
compute_FPR_given_date(siteID,date)), # FPR
PsiSS = cbind(rep(0,J),rep(0,J)),
ThetaBS = cbind(c(0.95,0.9,0.85), # TPR
c(0.95,0.9,0.85)),
ThetaSS = cbind(c(0.25,0.10,0.15),
c(0.25,0.10,0.15)),
Nu = 1,
Nd = 1
)
simu_out <- simulate_nplcm(set_parameter)
out <- simu_out$data_nplcm
out$X = data.frame(SITE=rep(siteID,2),DATE=rep(date,2))
return(out)
})
# extract cases for the first Nd data lists and extract controls for the last Nu data lists
data_perSite = list(
Mobs=list(
MBS = list(
MBS1 = Reduce(rbind, c(lapply(fulldata_list_perSite[1:Nd], function(l) l$Mobs$MBS$MBS1[1,]),
lapply(fulldata_list_perSite[(Nd+1):(Nd+Nu)], function(l) l$Mobs$MBS$MBS1[2,])))),
MSS = list(
MSS1 = Reduce(rbind, c(lapply(fulldata_list_perSite[1:Nd], function(l) l$Mobs$MSS$MSS1[1,]),
lapply(fulldata_list_perSite[(Nd+1):(Nd+Nu)], function(l) l$Mobs$MSS$MSS1[2,])))),
MGS = NULL),
Y = Reduce(c, c(lapply(fulldata_list_perSite[1:Nd], function(l) l$Y[1]),
lapply(fulldata_list_perSite[(Nd+1):(Nd+Nu)], function(l) l$Y[2]))),
X = Reduce(rbind, c(lapply(fulldata_list_perSite[1:Nd], function(l) l$X[1,]),
lapply(fulldata_list_perSite[(Nd+1):(Nd+Nu)], function(l) l$X[2,]))))
return(data_perSite)
})
# put cases on top of controls
data_nplcm_order = Reduce(c,lapply(1:N.SITE, function(i) ((i-1)*(Nd+Nu)+1):((i-1)*(Nd+Nu)+Nd)))
data_nplcm_order = c(data_nplcm_order, setdiff(1:((Nd+Nu)*N.SITE),data_nplcm_order))
data_nplcm = list(
Mobs = list(
MBS = list(MBS1 = Reduce(rbind, lapply(data_nplcm_list, function(l) l$Mobs$MBS$MBS1))[data_nplcm_order,]),
MSS = list(MSS1 = Reduce(rbind, lapply(data_nplcm_list, function(l) l$Mobs$MSS$MSS1))[data_nplcm_order,]),
MGS=NULL),
Y = Reduce(c, lapply(data_nplcm_list, function(l) l$Y))[data_nplcm_order],
X = Reduce(rbind, lapply(data_nplcm_list, function(l) l$X))[data_nplcm_order,])
# add the column Y recording case/control status
data_nplcm$X=cbind(data_nplcm$X,data.frame(Y=data_nplcm$Y))
We can visulize the seasonality trend of the positive rate of a given pathogen in MBS1 by calling the function visualize_season().For a selected pathogen, (pathogen B in the following example), it plots the conditional probability \(P(M_i^B=1|t_i)\) against date \(t_i\), separated by case group (in black) and control group (in blue), where the bars record \(M_i^B\). Further, the solid line is a fitted logistic regression line to model the trend. Note that the function uses data_nplcm$X$ENRLDATE as date by defualt, so we need to make sure that this column exists.
ENRLDATE.seq = seq(as.Date('2010-01-06'),as.Date('2010-11-01'),by=1) # 300 days
data_nplcm$X$ENRLDATE = ENRLDATE.seq[data_nplcm$X$DATE]
visualize_season(data_nplcm,"B")
model_options includes the following three aspects.
k_subclass = 1 indicates conditional independence model.Eti_formula = ~ s_date_Eti(DATE,Y,basis='ncs',3) +as.factor(SITE) is the regression formula for etiology. We will use natural cubic spline with degree of freedom 3 to model the effect of DATE and SITE is included as a discrete variable.FPR_formula = list(MBS1 = ~ s_date_FPR(DATE,Y,basis = "ps",3) + as.factor(SITE)) is the regression formula for the FPR in MBS1 measurement. We will use penalized slines with degree of freedom 3 to model the effect of DATE. Also, SITE is included as a discrete regressor.Eti_prior: a single, positive real number representing the standard deviation of beta coefficients in etiology regression. We set the standard deviation as 1 here.TPR_prior: informative priors with range 0.5 ~ 0.99 for MBS1 measurement and range 0.01 ~ 0.5 for MSS1 measurement are used in this caseBrS_object_1 <- make_meas_object(LETTERS[1:J],"MBS","1","BrS",cause_list)
SS_object_1 <- make_meas_object(LETTERS[1:2],"MSS","1","SS",cause_list)
model_options <- list(likelihood = list(cause_list = cause_list, # <---- fitted causes.
k_subclass = c(1), # <---- no. of subclasses
Eti_formula = ~ s_date_Eti(DATE,Y,basis='ncs',3) +as.factor(SITE),
FPR_formula = list(
MBS1 = ~ s_date_FPR(DATE,Y,basis = "ps",3) +as.factor(SITE)
)),
use_measurements = c("BrS","SS"), # <---- which measurements to use to inform etiology
prior = list(
Eti_prior = 1, # <--- etiology prior.
TPR_prior = list(
BrS = list(info = "informative",
input = "match_range",
val = list(
MBS1 = list(up = list(rep(0.99,length(BrS_object_1$patho))),
low = list(rep(0.5,length(BrS_object_1$patho)))
))
),
SS = list(info = "informative",
input = "match_range",
val = list(
MSS1 = list(up = list(rep(0.5,length(SS_object_1$patho))),
low = list(rep(0.01,length(SS_object_1$patho)))
))
)
) # <---- TPR prior.
)
)
assign_model(model_options,data_nplcm)
## $num_slice
## MBS MSS MGS
## 1 1 0
##
## $nested
## [1] FALSE
##
## $regression
## $regression$do_reg_Eti
## [1] TRUE
##
## $regression$do_reg_FPR
## $regression$do_reg_FPR$MBS1
## [1] TRUE
##
##
## $regression$is_discrete_predictor
## $regression$is_discrete_predictor$Eti
## [1] FALSE
##
## $regression$is_discrete_predictor$FPR
## MBS1
## FALSE
##
##
##
## $BrS_grp
## [1] FALSE
##
## $SS_grp
## [1] FALSE
Because we use JAGS for Bayesian inference of the models, we need to provide .bug files with model likelihood and prior. The package baker can interpret the specified models and write them in .bug files. In this case, baker uses write_model_Reg_NoNest() to write .bug file for conditional independence models with discrete regression, accommodating multiple bronze-standard and silver-standard data.
# parent directory for testing code (LOCAL):
working_dir <- tempdir()
# date stamp for analysis:
Date <- gsub("-", "_", Sys.Date())
# include stratification information in file name:
dated_strat_name <- paste0(working_dir,Date,"_continuous_predictor")
# create folder
result_folder <- dated_strat_name
dir.create(result_folder)
# options for MCMC chains:
mcmc_options <- list(debugstatus = TRUE,
n.chains = 1,
n.itermcmc = 10000,
n.burnin = 5000,
n.thin = 10,
individual.pred = !TRUE,
ppd = !TRUE,
get.pEti = !TRUE,
result.folder = result_folder,
bugsmodel.dir = result_folder,
jags.dir = "",
use_jags = TRUE)
# Record the settings of current analysis:
dput(data_nplcm,file.path(mcmc_options$result.folder,"data_nplcm.txt")) # <-- in case covariate data X are needed for plotting.
rjags::load.module("glm")
gs <- nplcm(data_nplcm,model_options,mcmc_options)
The file jagsdata.txt contains all data and parameters used when fitting the model, which is saved in bugs.dat as follows. The files CODAchain1.txt and CODAindex.txt together records the posterior samples for:
thetaBS_1: TPR for MBS1; J series corresponding to J pathogensthetaSS_1: TPR for MSS1; 2 series corresponding 2 pathogensbetaFPR_1: regression parameter for FPR for MBS1; 5*J series corresponding to 5 parameters per pathogen. The 5 parameters are intercept, coefficient for SITE and parameters in the penalized sline of DATE with degree of freedom 3.taubeta: the standard deviation of beta coefficients in FPR regressionbetaEti: regression parameter for Etiology probabilities; 5*J series corresponding to 5 parameters per pathogen. The 5 parameters are intercept, coefficient for SITE and parameters in the penalized sline of DATE with degree of freedom 3.The posterior samples can be extracted by coda::read.coda and saved in res_nplcm. The function print_res prints the traceplot for selected series. For instance, the following code prints the traceplots for betaEti.
# JAGS:
DIR_NPLCM <- result_folder
new_env <- new.env()
source(file.path(DIR_NPLCM,"jagsdata.txt"),local=new_env)
bugs.dat <- as.list(new_env)
rm(new_env)
res_nplcm <- coda::read.coda(file.path(DIR_NPLCM,"CODAchain1.txt"),
file.path(DIR_NPLCM,"CODAindex.txt"),
quiet=TRUE)
get_res <- function(x) res_nplcm[,grep(x,colnames(res_nplcm))]
print_res <- function(x) for (i in grep(x,colnames(res_nplcm))) plot(res_nplcm[,i],main=paste0('Trace of ',colnames(res_nplcm)[i]))
print_res("betaEti")
We can recover posterior etiology probabilities from posterior betaEti. Note that the transformed design matrix for Etiology regression has been saved in Z_Eti (which is derived by sourcing jagsdata.txt). First, we structure posterior betaEti as an array of dimension #posterior samples * #parameters * J. For each posterior sample, posterior etiology probabilities for each case can be calculated as normalized exp(Z_Eti%*%betaEti). Further, posterior Etiology mean and quantiles for each case can be calculated by marginalizing over all posterior samples.
# structure the posterior samples:
JBrS_1 <- ncol(bugs.dat$MBS_1)
n_samp_kept <- nrow(res_nplcm)
Jcause <- bugs.dat$Jcause
Nd <- bugs.dat$Nd
Nu <- bugs.dat$Nu
n_unique_Eti_level <- bugs.dat$n_unique_Eti_level
betaEti_post <- array(get_res("betaEti"),c(n_samp_kept,5,Jcause)) # of dimension (#samples * #paras * J)
Phi_post <- array(t(apply(betaEti_post,1,function(beta) exp(Z_Eti %*% beta))),c(n_samp_kept,dim(Z_Eti)[1],Jcause)) # of dimension (#samples * #cases * J)
Eti_post <- apply(Phi_post,c(1,2), function(Phi) Phi/sum(Phi)) # of dimension(J * #samples * #cases)
Eti_post_mean <- apply(Eti_post, c(1,3), mean) # of dimension(J * #cases)
Eti_post_q <- apply(Eti_post,c(1,3),quantile,c(0.025,0.975)) # # of dimension(2 * J * #cases)
Further, we can plot posterior etiology probabilities against date and compare it with our simulation assumption. In the following plot, red lines are for Site 1 while green lines are for Site 2. The solid lines represents the true relationship between date and Etiology, which we formulated in data simulation. The dots plot posterior Eiology mean against date and the boundary of the shadow area correponds to 2.5% and 97.5% posterior quantiles. From the plot, the natural cubic spline regression with degree of freedom 3 can roughly reproduce the quadratic additive formulation.
library(ggplot2)
library(dplyr)
plot_Eti_posterior <- function(EtiIndex) {
tmp = data.frame(Date = data_nplcm$X$DATE[1:600],
Site = data_nplcm$X$SITE[1:600],
Eti_mean = Eti_post_mean[EtiIndex,],
Eti_lq = Eti_post_q[1,EtiIndex,],
Eti_hq = Eti_post_q[2,EtiIndex,])
tmp = tmp%>%
mutate(Eti_true = sapply(1:600, function(i) compute_Eti_given_date(tmp$Site[i],tmp$Date[i])[EtiIndex]))
p<- ggplot(tmp) +
geom_point(aes(Date,Eti_mean,color=as.factor(Site)),size=0.5)+
geom_ribbon(aes(x=Date,ymin=Eti_lq,ymax=Eti_hq,color=as.factor(Site)),alpha=0.3)+
geom_line(aes(Date,Eti_true,color=as.factor(Site)),size=1)
return(p)
}
gridExtra::grid.arrange(grobs=lapply(1:Jcause, plot_Eti_posterior),nrow=3,ncol=1)